{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 08 Marginal and conditional distributions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<iframe width=\"700\" height=\"400\" src=\"https://www.youtube.com/embed/Iw9fEYIpPMA/\" frameborder=\"0\" allowfullscreen></iframe>\n"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "%%html\n",
    "<iframe width=\"700\" height=\"400\" src=\"https://www.youtube.com/embed/Iw9fEYIpPMA/\" frameborder=\"0\" allowfullscreen></iframe>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import plotly.graph_objects as go\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import findspark\n",
    "\n",
    "findspark.init()\n",
    "from pyspark.context import SparkContext\n",
    "from pyspark.sql import functions as F\n",
    "from pyspark.sql.session import SparkSession\n",
    "\n",
    "spark = SparkSession.builder.appName(\"statistics\").master(\"local\").getOrCreate()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[khanacademy](https://www.khanacademy.org/math/ap-statistics/analyzing-categorical-ap/distributions-two-way-tables/v/marginal-distribution-and-conditional-distribution?modal=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![Marginal and conditional distributions fig 1](./imgs/01-07-01.png)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = {\n",
    "    \"#\": [\"80-100\", \"60-79\", \"40-59\", \"20-39\", \"0-19\"],\n",
    "    \"0-20\": [0, 0, 2, 10, 2],\n",
    "    \"21-40\": [4, 20, 4, 2, 0],\n",
    "    \"41-60\": [16, 30, 32, 8, 0],\n",
    "    \">60\": [20, 10, 32, 0, 8],\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0-20</th>\n",
       "      <th>21-40</th>\n",
       "      <th>41-60</th>\n",
       "      <th>&gt;60</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>#</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>80-100</th>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>16</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>60-79</th>\n",
       "      <td>0</td>\n",
       "      <td>20</td>\n",
       "      <td>30</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40-59</th>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>32</td>\n",
       "      <td>32</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20-39</th>\n",
       "      <td>10</td>\n",
       "      <td>2</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0-19</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Total</th>\n",
       "      <td>14</td>\n",
       "      <td>30</td>\n",
       "      <td>86</td>\n",
       "      <td>70</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        0-20  21-40  41-60  >60\n",
       "#                              \n",
       "80-100     0      4     16   20\n",
       "60-79      0     20     30   10\n",
       "40-59      2      4     32   32\n",
       "20-39     10      2      8    0\n",
       "0-19       2      0      0    8\n",
       "Total     14     30     86   70"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(dataset).set_index(\"#\")\n",
    "df = df.append(df.sum().rename(\"Total\"))\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+----+-----+-----+---+\n",
      "|     #|0-20|21-40|41-60|>60|\n",
      "+------+----+-----+-----+---+\n",
      "|80-100|   0|    4|   16| 20|\n",
      "| 60-79|   0|   20|   30| 10|\n",
      "| 40-59|   2|    4|   32| 32|\n",
      "| 20-39|  10|    2|    8|  0|\n",
      "|  0-19|   2|    0|    0|  8|\n",
      "+------+----+-----+-----+---+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sdf = spark.createDataFrame(zip(*dataset.values()), schema=list(dataset.keys()))\n",
    "sdf.registerTempTable(\"sdf_table\")\n",
    "sdf.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0-20</th>\n",
       "      <th>21-40</th>\n",
       "      <th>41-60</th>\n",
       "      <th>&gt;60</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>80-100</th>\n",
       "      <td>0.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>16.0</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>60-79</th>\n",
       "      <td>0.0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>30.0</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40-59</th>\n",
       "      <td>2.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>32.0</td>\n",
       "      <td>32.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20-39</th>\n",
       "      <td>10.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>8.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0-19</th>\n",
       "      <td>2.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>8.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Total</th>\n",
       "      <td>14.0</td>\n",
       "      <td>30.0</td>\n",
       "      <td>86.0</td>\n",
       "      <td>70.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Percentage</th>\n",
       "      <td>7.0</td>\n",
       "      <td>15.0</td>\n",
       "      <td>43.0</td>\n",
       "      <td>35.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            0-20  21-40  41-60   >60\n",
       "80-100       0.0    4.0   16.0  20.0\n",
       "60-79        0.0   20.0   30.0  10.0\n",
       "40-59        2.0    4.0   32.0  32.0\n",
       "20-39       10.0    2.0    8.0   0.0\n",
       "0-19         2.0    0.0    0.0   8.0\n",
       "Total       14.0   30.0   86.0  70.0\n",
       "Percentage   7.0   15.0   43.0  35.0"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = df.append(\n",
    "    pd.DataFrame(\n",
    "        [df.iloc[-1] / df.iloc[-1].sum() * 100],\n",
    "        index=[\"Percentage\"],\n",
    "        columns=df.columns,\n",
    "    )\n",
    ")\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0-20</th>\n",
       "      <th>21-40</th>\n",
       "      <th>41-60</th>\n",
       "      <th>&gt;60</th>\n",
       "      <th>Total</th>\n",
       "      <th>Percentage</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>80-100</th>\n",
       "      <td>0.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>16.0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>40.0</td>\n",
       "      <td>8.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>60-79</th>\n",
       "      <td>0.0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>30.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>60.0</td>\n",
       "      <td>12.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40-59</th>\n",
       "      <td>2.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>32.0</td>\n",
       "      <td>32.0</td>\n",
       "      <td>70.0</td>\n",
       "      <td>14.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20-39</th>\n",
       "      <td>10.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>8.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0-19</th>\n",
       "      <td>2.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>8.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Total</th>\n",
       "      <td>14.0</td>\n",
       "      <td>30.0</td>\n",
       "      <td>86.0</td>\n",
       "      <td>70.0</td>\n",
       "      <td>200.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Percentage</th>\n",
       "      <td>7.0</td>\n",
       "      <td>15.0</td>\n",
       "      <td>43.0</td>\n",
       "      <td>35.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            0-20  21-40  41-60   >60  Total  Percentage\n",
       "80-100       0.0    4.0   16.0  20.0   40.0         8.0\n",
       "60-79        0.0   20.0   30.0  10.0   60.0        12.0\n",
       "40-59        2.0    4.0   32.0  32.0   70.0        14.0\n",
       "20-39       10.0    2.0    8.0   0.0   20.0         4.0\n",
       "0-19         2.0    0.0    0.0   8.0   10.0         2.0\n",
       "Total       14.0   30.0   86.0  70.0  200.0         NaN\n",
       "Percentage   7.0   15.0   43.0  35.0  100.0         NaN"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[\"Total\"] = df.sum(axis=1)\n",
    "df[\"Percentage\"] = df[\"Total\"] / df[\"Total\"].sum() * 100\n",
    "df.loc[\"Total\"][\"Percentage\"] = np.nan\n",
    "df.loc[\"Percentage\"][\"Percentage\"] = np.nan\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+----+-----+-----+---+-----+----------+\n",
      "|     #|0-20|21-40|41-60|>60|Total|Percentage|\n",
      "+------+----+-----+-----+---+-----+----------+\n",
      "|80-100|   0|    4|   16| 20|   40|      20.0|\n",
      "| 60-79|   0|   20|   30| 10|   60|      30.0|\n",
      "| 40-59|   2|    4|   32| 32|   70|      35.0|\n",
      "| 20-39|  10|    2|    8|  0|   20|      10.0|\n",
      "|  0-19|   2|    0|    0|  8|   10|       5.0|\n",
      "+------+----+-----+-----+---+-----+----------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sdf = sdf.withColumn(\n",
    "    \"Total\", F.col(\"0-20\") + F.col(\"21-40\") + F.col(\"41-60\") + F.col(\">60\")\n",
    ")\n",
    "sdf = sdf.withColumn(\n",
    "    \"Percentage\", F.col(\"Total\") / sdf.select(F.sum(\"Total\")).collect()[0][0] * 100\n",
    ")\n",
    "sdf.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0-20</th>\n",
       "      <th>21-40</th>\n",
       "      <th>41-60</th>\n",
       "      <th>&gt;60</th>\n",
       "      <th>Total</th>\n",
       "      <th>Percentage</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Total</th>\n",
       "      <td>14.0</td>\n",
       "      <td>30.0</td>\n",
       "      <td>86.0</td>\n",
       "      <td>70.0</td>\n",
       "      <td>200.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Percentage</th>\n",
       "      <td>7.0</td>\n",
       "      <td>15.0</td>\n",
       "      <td>43.0</td>\n",
       "      <td>35.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            0-20  21-40  41-60   >60  Total  Percentage\n",
       "Total       14.0   30.0   86.0  70.0  200.0         NaN\n",
       "Percentage   7.0   15.0   43.0  35.0  100.0         NaN"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "marginal_dist_v_df = df.iloc[-2:]\n",
    "marginal_dist_v_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Total</th>\n",
       "      <th>Percentage</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>80-100</th>\n",
       "      <td>40.0</td>\n",
       "      <td>8.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>60-79</th>\n",
       "      <td>60.0</td>\n",
       "      <td>12.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40-59</th>\n",
       "      <td>70.0</td>\n",
       "      <td>14.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20-39</th>\n",
       "      <td>20.0</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0-19</th>\n",
       "      <td>10.0</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Total</th>\n",
       "      <td>200.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Percentage</th>\n",
       "      <td>100.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            Total  Percentage\n",
       "80-100       40.0         8.0\n",
       "60-79        60.0        12.0\n",
       "40-59        70.0        14.0\n",
       "20-39        20.0         4.0\n",
       "0-19         10.0         2.0\n",
       "Total       200.0         NaN\n",
       "Percentage  100.0         NaN"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "marginal_dist_h_df = df.loc[:, \"Total\":]\n",
    "marginal_dist_h_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+-----+----------+\n",
      "|     #|Total|Percentage|\n",
      "+------+-----+----------+\n",
      "|80-100|   40|      20.0|\n",
      "| 60-79|   60|      30.0|\n",
      "| 40-59|   70|      35.0|\n",
      "| 20-39|   20|      10.0|\n",
      "|  0-19|   10|       5.0|\n",
      "+------+-----+----------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sdf.select(\"#\", \"Total\", \"Percentage\").show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>41-60</th>\n",
       "      <th>Percentage</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>80-100</th>\n",
       "      <td>16.0</td>\n",
       "      <td>18.604651</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>60-79</th>\n",
       "      <td>30.0</td>\n",
       "      <td>34.883721</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40-59</th>\n",
       "      <td>32.0</td>\n",
       "      <td>37.209302</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20-39</th>\n",
       "      <td>8.0</td>\n",
       "      <td>9.302326</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0-19</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        41-60  Percentage\n",
       "80-100   16.0   18.604651\n",
       "60-79    30.0   34.883721\n",
       "40-59    32.0   37.209302\n",
       "20-39     8.0    9.302326\n",
       "0-19      0.0    0.000000"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "conditional_dist_df = pd.DataFrame(df.loc[:, \"41-60\"])\n",
    "conditional_dist_df = conditional_dist_df.iloc[:-2]\n",
    "conditional_dist_df[\"Percentage\"] = (\n",
    "    conditional_dist_df[\"41-60\"] / conditional_dist_df[\"41-60\"].sum() * 100\n",
    ")\n",
    "conditional_dist_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+-----+------------------+\n",
      "|     #|41-60|        Percentage|\n",
      "+------+-----+------------------+\n",
      "|80-100|   16|  18.6046511627907|\n",
      "| 60-79|   30|34.883720930232556|\n",
      "| 40-59|   32|  37.2093023255814|\n",
      "| 20-39|    8|  9.30232558139535|\n",
      "|  0-19|    0|               0.0|\n",
      "+------+-----+------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sdf.select(\"#\", \"41-60\").withColumn(\n",
    "    \"Percentage\", F.col(\"41-60\") / sdf.select(F.sum(\"41-60\")).collect()[0][0] * 100\n",
    ").show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}